{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open(\"./data/KnowledgeTracing.txt\", \"r\") as f:\n",
    "    subjectList = [line.strip().split(\"\\t\") for line in f.readlines()]\n",
    "\n",
    "subjectList[0][0] = \"student_id\"\n",
    "subjectList[0][1] = \"question_id\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "newSubjectList = [item for item in subjectList if len(item) == 6]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "df = pd.DataFrame(newSubjectList[1:], columns=newSubjectList[0])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>question_id</th>\n",
       "      <th>is_correct</th>\n",
       "      <th>end_time</th>\n",
       "      <th>skill_id</th>\n",
       "      <th>skill_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>204825447039028</td>\n",
       "      <td>2011197</td>\n",
       "      <td>2</td>\n",
       "      <td>1534149419152</td>\n",
       "      <td>33685784</td>\n",
       "      <td>展开图折叠成几何体</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>204825447039028</td>\n",
       "      <td>2000021</td>\n",
       "      <td>3</td>\n",
       "      <td>1534149898771</td>\n",
       "      <td>33685509</td>\n",
       "      <td>数轴</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>204825447039028</td>\n",
       "      <td>2000054</td>\n",
       "      <td>0</td>\n",
       "      <td>1534149898771</td>\n",
       "      <td>33685507</td>\n",
       "      <td>正数与负数</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>204825447039028</td>\n",
       "      <td>2047649</td>\n",
       "      <td>2</td>\n",
       "      <td>1534149909218</td>\n",
       "      <td>33685783</td>\n",
       "      <td>几何体的展开图</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>204825447039028</td>\n",
       "      <td>2047649</td>\n",
       "      <td>2</td>\n",
       "      <td>1534149909218</td>\n",
       "      <td>33685548</td>\n",
       "      <td>列代数式</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "        student_id question_id is_correct       end_time  skill_id skill_name\n",
       "0  204825447039028     2011197          2  1534149419152  33685784  展开图折叠成几何体\n",
       "1  204825447039028     2000021          3  1534149898771  33685509         数轴\n",
       "2  204825447039028     2000054          0  1534149898771  33685507      正数与负数\n",
       "3  204825447039028     2047649          2  1534149909218  33685783    几何体的展开图\n",
       "4  204825447039028     2047649          2  1534149909218  33685548       列代数式"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "newDf = df.sort_values(['student_id', 'end_time'])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>question_id</th>\n",
       "      <th>is_correct</th>\n",
       "      <th>end_time</th>\n",
       "      <th>skill_id</th>\n",
       "      <th>skill_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65010</th>\n",
       "      <td>100001</td>\n",
       "      <td>2026047</td>\n",
       "      <td>0</td>\n",
       "      <td>1535104376569</td>\n",
       "      <td>33685830</td>\n",
       "      <td>线段垂直平分线的性质</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65447</th>\n",
       "      <td>100001</td>\n",
       "      <td>2039324</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376595</td>\n",
       "      <td>33685920</td>\n",
       "      <td>轴对称图形</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65248</th>\n",
       "      <td>100001</td>\n",
       "      <td>2041243</td>\n",
       "      <td>0</td>\n",
       "      <td>1535104376598</td>\n",
       "      <td>33685920</td>\n",
       "      <td>轴对称图形</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65011</th>\n",
       "      <td>100001</td>\n",
       "      <td>2043477</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376602</td>\n",
       "      <td>33685920</td>\n",
       "      <td>轴对称图形</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65448</th>\n",
       "      <td>100001</td>\n",
       "      <td>2050662</td>\n",
       "      <td>0</td>\n",
       "      <td>1535104376618</td>\n",
       "      <td>33685921</td>\n",
       "      <td>轴对称的性质</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      student_id question_id is_correct       end_time  skill_id  skill_name\n",
       "65010     100001     2026047          0  1535104376569  33685830  线段垂直平分线的性质\n",
       "65447     100001     2039324          1  1535104376595  33685920       轴对称图形\n",
       "65248     100001     2041243          0  1535104376598  33685920       轴对称图形\n",
       "65011     100001     2043477          1  1535104376602  33685920       轴对称图形\n",
       "65448     100001     2050662          0  1535104376618  33685921      轴对称的性质"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newDf.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1       69357\n",
       "0       48892\n",
       "3       21902\n",
       "2        4790\n",
       "4         352\n",
       "1.00       90\n",
       "0.00       65\n",
       "Name: is_correct, dtype: int64"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newDf.is_correct.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [],
   "source": [
    "newDf1 = newDf[newDf[\"is_correct\"].isin([\"1\", \"2\", \"3\"])]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/jiangxinyang848/anaconda3/envs/jiang/lib/python3.5/site-packages/ipykernel_launcher.py:1: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \"\"\"Entry point for launching an IPython kernel.\n",
      "/home/jiangxinyang848/anaconda3/envs/jiang/lib/python3.5/site-packages/ipykernel_launcher.py:2: SettingWithCopyWarning: \n",
      "A value is trying to be set on a copy of a slice from a DataFrame.\n",
      "Try using .loc[row_indexer,col_indexer] = value instead\n",
      "\n",
      "See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy\n",
      "  \n"
     ]
    }
   ],
   "source": [
    "newDf1[\"is_correct\"] = newDf1[\"is_correct\"].replace([\"2\", \"3\"], 0)\n",
    "newDf1[\"is_correct\"] = newDf1[\"is_correct\"].replace(\"1\", 1)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1    69357\n",
       "0    26692\n",
       "Name: is_correct, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newDf1.is_correct.value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "685"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "students = newDf1.student_id.tolist()\n",
    "len(set(students))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "267"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "skills = newDf1.skill_id.tolist()\n",
    "len(set(skills))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>question_id</th>\n",
       "      <th>is_correct</th>\n",
       "      <th>end_time</th>\n",
       "      <th>skill_id</th>\n",
       "      <th>skill_name</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65447</th>\n",
       "      <td>100001</td>\n",
       "      <td>2039324</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376595</td>\n",
       "      <td>33685920</td>\n",
       "      <td>轴对称图形</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65011</th>\n",
       "      <td>100001</td>\n",
       "      <td>2043477</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376602</td>\n",
       "      <td>33685920</td>\n",
       "      <td>轴对称图形</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65251</th>\n",
       "      <td>100001</td>\n",
       "      <td>2080269</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376629</td>\n",
       "      <td>33685921</td>\n",
       "      <td>轴对称的性质</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65252</th>\n",
       "      <td>100001</td>\n",
       "      <td>2080269</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376629</td>\n",
       "      <td>33685827</td>\n",
       "      <td>全等三角形的判定与性质</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65253</th>\n",
       "      <td>100001</td>\n",
       "      <td>2080269</td>\n",
       "      <td>1</td>\n",
       "      <td>1535104376629</td>\n",
       "      <td>33685869</td>\n",
       "      <td>正方形的性质</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      student_id question_id  is_correct       end_time  skill_id   skill_name\n",
       "65447     100001     2039324           1  1535104376595  33685920        轴对称图形\n",
       "65011     100001     2043477           1  1535104376602  33685920        轴对称图形\n",
       "65251     100001     2080269           1  1535104376629  33685921       轴对称的性质\n",
       "65252     100001     2080269           1  1535104376629  33685827  全等三角形的判定与性质\n",
       "65253     100001     2080269           1  1535104376629  33685869       正方形的性质"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "newDf1.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {},
   "outputs": [],
   "source": [
    "ktDf = newDf1[[\"student_id\", \"skill_id\", \"is_correct\"]]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>student_id</th>\n",
       "      <th>skill_id</th>\n",
       "      <th>is_correct</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>65447</th>\n",
       "      <td>100001</td>\n",
       "      <td>33685920</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65011</th>\n",
       "      <td>100001</td>\n",
       "      <td>33685920</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65251</th>\n",
       "      <td>100001</td>\n",
       "      <td>33685921</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65252</th>\n",
       "      <td>100001</td>\n",
       "      <td>33685827</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>65253</th>\n",
       "      <td>100001</td>\n",
       "      <td>33685869</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "      student_id  skill_id  is_correct\n",
       "65447     100001  33685920           1\n",
       "65011     100001  33685920           1\n",
       "65251     100001  33685921           1\n",
       "65252     100001  33685827           1\n",
       "65253     100001  33685869           1"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "ktDf.head(5)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {},
   "outputs": [],
   "source": [
    "ktDf.to_csv(\"./data/knowledgeTracing.csv\", index=False)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.5.6"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
